library(data.table) #version 1.11.4
library(ggplot2) #version 3.1.0
library(RColorBrewer) #1.1-2
library(foreach) #1.4.4

#set your working directory to the folder that contains the GT16_<keyword>.csv and GT17_<keyword>.csv files

# Key terms for which raw GT data were collected. The functions below build
# their input file names ("GT16_<term>.csv" / "GT17_<term>.csv") from these
# strings.
keyword_ls <- c(
  "terrorist",
  "verkoudheid",
  "influenza",
  "hooikoorts",
  "griep",
  "festival",
  "hacking",
  "depressie",
  "files",
  "datalek"
)

# Calibrate the collected raw GT data for one key term, for 2016 and 2017 at
# the same time (the calibration method is described in the paper).
#
# Reads "GT16_<keyword>.csv" and "GT17_<keyword>.csv" (paths are relative, so
# they are resolved against the working directory). Both files are expected to
# hold a column `X1` (dropped), `date`, `keyword`, and one column per sample.
# The 2016 sample columns must already be named sample1, sample2, ...; the
# 2017 sample columns are renamed to continue that numbering.
#
# Arguments:
#   keyword         key term; used to build both file names and stored in the
#                   `term` column of the result.
#   first_sample17  number assigned to the first 2017 sample column
#                   (default 367).
#   last_sample     number of the last sample overall (default 974). The 2017
#                   file must contain exactly
#                   last_sample - first_sample17 + 1 sample columns.
#
# Returns a data frame with one row per date and the columns `date`, `m`
# (mean), `med` (median), `q0.025`, `q0.975` (quantiles across the calibrated
# samples) and `term`.
#
# All non-base functions are namespace-qualified because this script does not
# attach readr, dplyr or tidyr.
parse_GT <- function(keyword, first_sample17 = 367L, last_sample = 974L) {
  # X1 is presumably a row-index column stored with the file -- TODO confirm.
  gt16_wide <- dplyr::select(
    readr::read_csv(paste0("GT16_", keyword, ".csv")),
    -X1
  )
  gt16 <- tidyr::gather(gt16_wide, key = sample, value = pop, -date, -keyword)

  gt17_wide <- dplyr::select(
    readr::read_csv(paste0("GT17_", keyword, ".csv")),
    -X1
  )
  colnames(gt17_wide) <- c(
    "date", "keyword",
    paste0("sample", first_sample17:last_sample)
  )
  gt17 <- tidyr::gather(gt17_wide, key = sample, value = pop, -date, -keyword)

  # Stack both years, drop missing observations, and go back to one column
  # per sample (dates a sample does not cover become NA).
  gt_long <- dplyr::filter(dplyr::bind_rows(gt16, gt17), !is.na(pop))
  GT1617 <- tidyr::spread(gt_long, sample, pop)

  # Chain calibration: sample i is multiplied by
  #   sum(previous sample without its first value) /
  #   sum(sample i without its last value),
  # where the previous sample has already been rescaled.
  # NOTE(review): this positional pairing presumes consecutive samples cover
  # windows shifted by exactly one row -- confirm against the data.
  first_bad <- NA_integer_
  for (i in seq_len(last_sample - 1L) + 1L) {
    ref <- dplyr::pull(GT1617, paste0("sample", i - 1L))
    # tail()/head() with -1 stay correct for samples with fewer than two
    # values, where 2:length(ref) and 1:(length(x) - 1) would count backwards.
    ref <- tail(ref[!is.na(ref)], -1L)

    x <- dplyr::pull(GT1617, paste0("sample", i))
    x <- head(x[!is.na(x)], -1L)

    w <- sum(ref) / sum(x)
    if (is.na(first_bad) && !is.finite(w)) {
      first_bad <- i
    }

    cur_col <- paste0("sample", i)
    GT1617[[cur_col]] <- GT1617[[cur_col]] * w

    if (i %% 100 == 0) {print(paste(keyword, i))}
  }

  # A non-finite weight contaminates every later sample through the chain, so
  # report it once instead of letting it pass silently.
  if (!is.na(first_bad)) {
    warning(
      "parse_GT('", keyword, "'): non-finite calibration weight first seen ",
      "at sample", first_bad, "; later samples are affected as well.",
      call. = FALSE
    )
  }

  gt_samples <- tidyr::gather(
    GT1617,
    key = sample, value = pop, -date, -keyword
  )
  GT1617_final <- dplyr::summarise(
    dplyr::group_by(gt_samples, date),
    m = mean(pop, na.rm = TRUE),
    med = median(pop, na.rm = TRUE),
    q0.025 = quantile(pop, probs = 0.025, na.rm = TRUE),
    q0.975 = quantile(pop, probs = 0.975, na.rm = TRUE)
  )
  GT1617_final <- dplyr::mutate(GT1617_final, term = keyword)

  # Uncomment to also store the calibrated series of this term:
  # readr::write_csv(GT1617_final, paste0("GT1617_", keyword, ".csv"))
  GT1617_final
}

# Calibrate the hooikoorts data (noted as the only variable that has missing
# data). In this function the 2016 `date` column is converted with
# lubridate::mdy() before the 2016 and 2017 data are combined -- presumably
# because that file stores dates as month/day/year text; TODO confirm.
#
# Reads "GT16_<keyword>.csv" and "GT17_<keyword>.csv" (paths are relative, so
# they are resolved against the working directory). Both files are expected to
# hold a column `X1` (dropped), `date`, `keyword`, and one column per sample.
# The 2016 sample columns must already be named sample1, sample2, ...; the
# 2017 sample columns are renamed to continue that numbering.
#
# Arguments:
#   keyword         key term; used to build both file names and stored in the
#                   `term` column of the result.
#   first_sample17  number assigned to the first 2017 sample column
#                   (default 367).
#   last_sample     number of the last sample overall (default 974). The 2017
#                   file must contain exactly
#                   last_sample - first_sample17 + 1 sample columns.
#
# Returns a data frame with one row per date and the columns `date`, `m`
# (mean), `med` (median), `q0.025`, `q0.975` (quantiles across the calibrated
# samples) and `term`.
#
# All non-base functions are namespace-qualified because this script does not
# attach readr, dplyr, tidyr or lubridate.
parse_hooikoorts <- function(keyword,
                             first_sample17 = 367L,
                             last_sample = 974L) {
  # X1 is presumably a row-index column stored with the file -- TODO confirm.
  gt16_wide <- dplyr::select(
    readr::read_csv(paste0("GT16_", keyword, ".csv")),
    -X1
  )
  gt16_wide <- dplyr::mutate(gt16_wide, date = lubridate::mdy(date))
  gt16 <- tidyr::gather(gt16_wide, key = sample, value = pop, -date, -keyword)

  gt17_wide <- dplyr::select(
    readr::read_csv(paste0("GT17_", keyword, ".csv")),
    -X1
  )
  colnames(gt17_wide) <- c(
    "date", "keyword",
    paste0("sample", first_sample17:last_sample)
  )
  gt17 <- tidyr::gather(gt17_wide, key = sample, value = pop, -date, -keyword)

  # Stack both years, drop missing observations, and go back to one column
  # per sample (dates a sample does not cover become NA).
  gt_long <- dplyr::filter(dplyr::bind_rows(gt16, gt17), !is.na(pop))
  GT1617 <- tidyr::spread(gt_long, sample, pop)

  # Chain calibration: sample i is multiplied by
  #   sum(previous sample without its first value) /
  #   sum(sample i without its last value),
  # where the previous sample has already been rescaled.
  # NOTE(review): this positional pairing presumes consecutive samples cover
  # windows shifted by exactly one row and that no values are missing inside
  # a window -- confirm against the data.
  first_bad <- NA_integer_
  for (i in seq_len(last_sample - 1L) + 1L) {
    ref <- dplyr::pull(GT1617, paste0("sample", i - 1L))
    # tail()/head() with -1 stay correct for samples with fewer than two
    # values, where 2:length(ref) and 1:(length(x) - 1) would count backwards.
    ref <- tail(ref[!is.na(ref)], -1L)

    x <- dplyr::pull(GT1617, paste0("sample", i))
    x <- head(x[!is.na(x)], -1L)

    w <- sum(ref) / sum(x)
    if (is.na(first_bad) && !is.finite(w)) {
      first_bad <- i
    }

    cur_col <- paste0("sample", i)
    GT1617[[cur_col]] <- GT1617[[cur_col]] * w

    if (i %% 100 == 0) {print(paste(keyword, i))}
  }

  # A non-finite weight contaminates every later sample through the chain, so
  # report it once instead of letting it pass silently.
  if (!is.na(first_bad)) {
    warning(
      "parse_hooikoorts('", keyword, "'): non-finite calibration weight ",
      "first seen at sample", first_bad,
      "; later samples are affected as well.",
      call. = FALSE
    )
  }

  gt_samples <- tidyr::gather(
    GT1617,
    key = sample, value = pop, -date, -keyword
  )
  GT1617_final <- dplyr::summarise(
    dplyr::group_by(gt_samples, date),
    m = mean(pop, na.rm = TRUE),
    med = median(pop, na.rm = TRUE),
    q0.025 = quantile(pop, probs = 0.025, na.rm = TRUE),
    q0.975 = quantile(pop, probs = 0.975, na.rm = TRUE)
  )
  GT1617_final <- dplyr::mutate(GT1617_final, term = keyword)

  # Uncomment to also store the calibrated series of this term:
  # readr::write_csv(GT1617_final, paste0("GT1617_", keyword, ".csv"))
  GT1617_final
}

# Worked example: calibrate a single term ("datalek"). At top level the
# returned summary is printed.
parse_GT("datalek")

# Calibrate all terms except hooikoorts, which is handled by its own function.
# The term is excluded by name rather than by position, so this stays correct
# if keyword_ls is reordered or extended.
GT1617_Full <- foreach(term = setdiff(keyword_ls, "hooikoorts")) %do%
  parse_GT(term)

# Calibrate hooikoorts.
GT1617_hooikoorts <- parse_hooikoorts("hooikoorts")

# Output the final file containing all the variables: the per-term results
# are stacked first, hooikoorts is appended last. Calls are namespace-qualified
# because this script does not attach dplyr or readr.
readr::write_csv(
  dplyr::bind_rows(dplyr::bind_rows(GT1617_Full), GT1617_hooikoorts),
  "GT1617_Full.csv"
)

